1 Setup

1.1 Packages

library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(biobroom)
library(ggridges)
library(ggthemes)
theme_set(theme_minimal())

1.2 Data

1.2.1 Metabolite Abundances

# Cells #
vf.cell.neg.raw <- read_csv("./data/abundances/p5fa_vpa_exp_hilic_target_cells_negmode_abundances.csv")
vf.cell.pos.raw <- read_csv("./data/abundances/p5fa_vpa_exp_hilic_target_cells_posmode_abundances.csv")
# Media #
vf.med.neg.raw <- read_csv("./data/abundances/p5fa_vpa_exp_hilic_target_media_negmode_abundances.csv")
vf.med.pos.raw <- read_csv("./data/abundances/p5fa_vpa_exp_hilic_target_media_posmode_abundances.csv")

1.2.2 Compound Information

# Cells #
vf.cell.neg.compound.info <- read_csv("./data/compound_info/p5fa_vpa_exp_hilic_target_cells_negmode_cmpnd_info.csv")
vf.cell.pos.compound.info <- read_csv("./data/compound_info/p5fa_vpa_exp_hilic_target_cells_posmode_cmpnd_info.csv")
# Media #
vf.med.neg.compound.info <- read_csv("./data/compound_info/p5fa_vpa_exp_hilic_target_media_negmode_cmpnd_info.csv")
vf.med.pos.compound.info <- read_csv("./data/compound_info/p5fa_vpa_exp_hilic_target_media_posmode_cmpnd_info.csv")
# Kegg/other ID reference #
#cmpnd.id.db <- read_csv("./data/anp_db_compound_kegg.csv")

1.3 Functions

MissingPerSamplePlot <- function(raw.data, start.string) {
  # Counts the number of missing/NA values per sample and
  # percent compounds missing out of total number of compounds per sample
  # Then passes the result into a vertical bar plot, where each 
  # bar represents a single sample and the size of the bar 
  # is the % of compounds missing
  
  counted.na <- raw.data %>%
  select(starts_with(start.string)) %>% 
  mutate(
    count.na = apply(., 1, function(x) sum(is.na(x))),
    percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
    ) %>%
  dplyr::select(count.na, percent.na) %>%
  bind_cols(
    raw.data %>% 
      select(Samples, Group)
      ) %>% 
  arrange(percent.na) %>% 
  mutate(f.order = factor(Samples, levels = Samples))
counted.na %>% 
  ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
  coord_flip()+
  xlab("Samples") +
  ylab("Percent missing values in sample") +
  theme(axis.text.y = element_text(size = 6)) 
}
MissingPerCompound <- function(raw.data, start.string){
  # Function to count in how many experimental samples each compound is missing
  # Also calculates the % missing:
  # (# NA per compound across all experimental samples) * 100 / (tot num of samples)
  
  raw.data %>% 
  filter(Group == "sample") %>% 
  select(Samples, starts_with(start.string)) %>% 
  gather(key = "Compound", value = "Abundance", -Samples) %>% 
  group_by(Compound) %>% 
  summarise(
    na_count = sum(is.na(Abundance)),
    n_samples = n(),
    percent_na = (na_count * 100 / n_samples)
    ) %>% 
  filter(na_count > 0) %>% 
  arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformMult <- function(raw.dataframe, start.prefix) {
  # Function to replace any NAs in each column with the minimum for that column, 
  # separately for each sample type.
  # NA in negative control samples are replaced by 2.
  # Then data is log2 transformed
  
  # experimental samples #
  smpls <- raw.dataframe %>%
    filter(Group == "sample") %>% 
    dplyr::select(starts_with(start.prefix))
  smpls.min <- lapply(smpls, min, na.rm = TRUE)
  smpls.noNA <- raw.dataframe  %>%
    filter(Group == "sample") %>% 
    dplyr::select(Samples:Plate) %>%
    bind_cols(
      smpls %>%
        replace_na(replace = smpls.min) %>% 
        log2()
      ) 
  # QC samples #
  QC <- raw.dataframe %>%
    filter(Group == "mix") %>% 
    dplyr::select(starts_with(start.prefix))
  QC.min <- lapply(QC, min, na.rm = TRUE)
  # replace the missing values in the QC with the minimum of the QC
  # then take the log
  QC.noNA <- raw.dataframe  %>%
    filter(Group == "mix") %>% 
    dplyr::select(Samples:Plate) %>%
    bind_cols(
      QC %>%
        replace_na(replace = QC.min) %>% 
        log2()
      )
  # not samples or QC #
  other.min <- setNames(
    as.list(
      rep(2, ncol(
        raw.dataframe %>% 
          dplyr::select(starts_with(start.prefix))))
      ),
    colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
                )
  other.num.log <- raw.dataframe  %>%
    filter(Group != "mix" & Group != "sample") %>% 
    dplyr::select(Samples:Plate) %>%
    bind_cols(
      raw.dataframe %>% 
        filter(Group != "mix" & Group != "sample") %>% 
        dplyr::select(starts_with(start.prefix)) %>%
        replace_na(replace = other.min) %>% 
        log2()
      )
  all.noNA <- smpls.noNA %>% 
    bind_rows(QC.noNA) %>% 
    bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
  # function for preparing dara for heatmap viz
  x <- raw.data %>% 
    select(starts_with(start.prefix)) %>% 
    scale(center = TRUE, scale = TRUE) %>% 
    as.matrix() 
  row.names(x) <- raw.data$Samples
  return(x)
}

2 Data Exploration

2.1 Mass vs Retention Time

Q: What are the distributions of compound masses and retention times?

full.vf.cmpnd <- vf.cell.neg.compound.info %>% 
  mutate(Set = "Cells / Neg") %>% 
  bind_rows(vf.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>% 
  bind_rows(vf.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
  bind_rows(vf.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.vf.cmpnd %>% 
  ggplot(aes(x = rt, y = mass)) +
  geom_point(size = 3, alpha = 0.3) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA + FA HILIC") +
  ylim(0, 1000)

full.vf.cmpnd %>% 
  ggplot(aes(x = rt, y = mass, color = Set)) +
  geom_point(size = 3, alpha = 0.8) +
  xlab("Retnetion Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA + FA HILIC") +
  facet_wrap(~ Set) +
  ylim(0, 1000)

Q: Which compounds were found in one or more of the data types?

## cell join ##
vf.cell.cmpnd.join <- vf.cell.neg.compound.info %>% 
  inner_join(vf.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / cells 
print(vf.cell.cmpnd.join$compound_full.c.n)
 [1] "Glycine"                                          
 [2] "Pyruvate"                                         
 [3] "Alanine"                                          
 [4] "Beta-Alanine"                                     
 [5] "Sarcosine"                                        
 [6] "2-Aminobutyric Acid"                              
 [7] "BAIBA"                                            
 [8] "GABA"                                             
 [9] "Serine"                                           
[10] "Hypotaurine"                                      
[11] "Uracil"                                           
[12] "Creatinine"                                       
[13] "Proline"                                          
[14] "Valine"                                           
[15] "Threonine"                                        
[16] "Homoserine"                                       
[17] "Taurine"                                          
[18] "Ketoleucine"                                      
[19] "N-Acetylalanine"                                  
[20] "Creatine"                                         
[21] "Leucine"                                          
[22] "Isoleucine"                                       
[23] "Asparagine"                                       
[24] "Ornithine"                                        
[25] "Aspartic Acid"                                    
[26] "Adenine"                                          
[27] "Glutamine"                                        
[28] "Lysine"                                           
[29] "Glutamic Acid"                                    
[30] "Methionine"                                       
[31] "Xanthine"                                         
[32] "4-Hydroxyphenylacetic Acid"                       
[33] "3-Sulfinoalanine"                                 
[34] "Histidine"                                        
[35] "Allantoin"                                        
[36] "5-Hydroxylysine"                                  
[37] "Phenylalanine"                                    
[38] "Pyridoxal"                                        
[39] "Pyridoxine"                                       
[40] "Glycerol 2-Phosphate"                             
[41] "Arginine"                                         
[42] "Tyrosine"                                         
[43] "D-Galactitol"                                     
[44] "D-Sorbitol"                                       
[45] "Phosphocholine"                                   
[46] "N-alpha-Acetyl-L-glutamine"                       
[47] "Tryptophan"                                       
[48] "Pantothenic Acid"                                 
[49] "Cystathionine"                                    
[50] "Methyl Jasmonate"                                 
[51] "Carnosine"                                        
[52] "Cytidine"                                         
[53] "Uridine"                                          
[54] "D-Glucose 6-phosphate"                            
[55] "D-Fructose 6-phosphate"                           
[56] "Thiamine (Vit B1)"                                
[57] "Inosine"                                          
[58] "Guanosine"                                        
[59] "Ophthalmic Acid"                                  
[60] "5'-Methylthioadenosine"                           
[61] "N-Acetylaspartyl Glutamic Acid"                   
[62] "Glutathione (GSH)"                                
[63] "N-Acetylneuraminic Acid"                          
[64] "UMP"                                              
[65] "3-Phosphoglyceroinositol"                         
[66] "AMP"                                              
[67] "S-Adenosylhomocysteine (SAH)"                     
[68] "CDP"                                              
[69] "ADP"                                              
[70] "GDP"                                              
[71] "UTP"                                              
[72] "ATP"                                              
[73] "GTP"                                              
[74] "Cyclic adenosine diphosphate ribose (cADP-ribose)"
[75] "UDP-N-Acetylgalactosamine"                        
[76] "GSSG"                                             
[77] "NAD"                                              
[78] "NADH"                                             
[79] "NADP"                                             
[80] "Flavin adenine dinucleotide (FAD)"                
[81] "Acetyl-CoA"                                       
# percent of cell / neg compounds found in cell / pos 
round(nrow(vf.cell.cmpnd.join) * 100 / nrow(vf.cell.neg.compound.info), 1)
[1] 58.3
# percent of cell / neg compounds found in cell / pos 
round(nrow(vf.cell.cmpnd.join) * 100 / nrow(vf.cell.pos.compound.info), 1)
[1] 56.2
vf.cell.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

vf.cell.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

### Media join ###
vf.med.cmpnd.join <- vf.med.neg.compound.info %>% 
  inner_join(vf.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / media
print(vf.med.cmpnd.join$compound_full.m.n)
 [1] "Alanine"           "Serine"            "Creatinine"       
 [4] "Proline"           "Valine"            "Threonine"        
 [7] "Taurine"           "Creatine"          "Leucine"          
[10] "Isoleucine"        "Glutamine"         "Lysine"           
[13] "Glutamic Acid"     "Methionine"        "Histidine"        
[16] "Allantoin"         "Phenylalanine"     "Pyridoxine"       
[19] "Arginine"          "Citrulline"        "Tyrosine"         
[22] "D-Sorbitol"        "Tryptophan"        "Pantothenic Acid" 
[25] "Thiamine (Vit B1)"
# percent of media / neg compounds found in media / pos
round(nrow(vf.med.cmpnd.join) * 100 / nrow(vf.med.neg.compound.info), 1)
[1] 44.6
# percent of media / pos compounds found in media / neg
round(nrow(vf.med.cmpnd.join) * 100 / nrow(vf.med.pos.compound.info), 1)
[1] 53.2
# vf all match
vf.all.cmpnd.join <- vf.cell.cmpnd.join %>% 
  inner_join(vf.med.cmpnd.join, by = "cas_id") %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
nrow(vf.all.cmpnd.join)
[1] 24
vf.all.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

vf.all.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

2.2 Missing Values

Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?

MissingPerSamplePlot(vf.cell.neg.raw, "hVPA_FAnC") +
  ggtitle("Missing Per Sample\nVPA + FA HILIC / Cells / Neg Mode")

MissingPerSamplePlot(vf.cell.pos.raw, "hVPA_FApC") +
  ggtitle("Missing Per Sample\nVPA + FA HILIC / Cells / Pos Mode")

MissingPerSamplePlot(vf.med.neg.raw, "hVPA_FAnM") +
  ggtitle("Missing Per Sample\nVPA + FA HILIC / Media / Neg Mode")

MissingPerSamplePlot(vf.med.pos.raw, "hVPA_FApM") +
  ggtitle("Missing Per Sample\nVPA + FA HILIC / Media / Pos Mode")

Q: Are any of the compounds more than 20% missing in the experimental sample group? If there are any, they will be excluded from analysis.

(vf.cell.neg.cmpnd.to.excl <- MissingPerCompound(vf.cell.neg.raw, "hVPA_FAnC") %>% 
  filter(percent_na > 20))
# A tibble: 5 x 4
  Compound     na_count n_samples percent_na
  <chr>           <int>     <int>      <dbl>
1 hVPA_FAnC139       14        22       63.6
2 hVPA_FAnC72         8        22       36.4
3 hVPA_FAnC113        7        22       31.8
4 hVPA_FAnC138        5        22       22.7
5 hVPA_FAnC64         5        22       22.7
MissingPerCompound(vf.cell.pos.raw, "hVPA_FApC") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
MissingPerCompound(vf.med.neg.raw, "hVPA_FAnM") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
MissingPerCompound(vf.med.pos.raw, "hVPA_FApM") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>

2.3 Negative Controls and Compound Elimination

2.3.1 Cells / Negative Mode

vf.cell.neg.raw.grp.mean <- vf.cell.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("hVPA_FAnC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.cell.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA HILIC / Cells / Negative Mode\nGrouped by sample type")

vf.cell.neg.raw.grp.mean.order <- vf.cell.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vf.cell.neg.raw %>% 
  select(Samples, Group, starts_with("hVPA_FAnC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.cell.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA HILIC/ Cells / Negative Mode")

vf.cell.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA HILIC / Cells / Negative Mode")

vf.cell.neg.raw.grp.diff <- vf.cell.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_solv_diff = sample / solv,
    )
vf.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vf.cell.neg.cmpnd.to.incl <- vf.cell.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% vf.cell.neg.cmpnd.to.excl$Compound))
# original number of metabolites
nrow(vf.cell.neg.raw.grp.diff)
[1] 139
# number of metabolites after filtering 
nrow(vf.cell.neg.cmpnd.to.incl)
[1] 132

2.3.2 Cells / Positive Mode

vf.cell.pos.raw.grp.mean <- vf.cell.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("hVPA_FApC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.cell.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA HILIC / Cells / Positive Mode\nGrouped by sample type")

vf.cell.pos.raw.grp.mean.order <- vf.cell.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vf.cell.pos.raw %>% 
  select(Samples, Group, starts_with("hVPA_FApC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.cell.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.raw.grp.diff <- vf.cell.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_solv_diff = sample / solv
    )
vf.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vf.cell.pos.cmpnd.to.incl <- vf.cell.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# original number of metabolites
nrow(vf.cell.pos.raw.grp.diff)
[1] 144
# filtered number
nrow(vf.cell.pos.cmpnd.to.incl)
[1] 142

2.3.3 Media / Negative Mode

vf.med.neg.raw.grp.mean <- vf.med.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("hVPA_FAnM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.med.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA / Media HILIC / Negative Mode\nGrouped by sample type")

vf.med.neg.raw.grp.mean.order <- vf.med.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vf.med.neg.raw %>% 
  select(Samples, Group, starts_with("hVPA_FAnM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 2) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.med.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.med.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA HILIC / Media / Negative Mode")

vf.med.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA HILIC / Media / Negative Mode")

vf.med.neg.raw.grp.diff <- vf.med.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_solv_diff = sample / solv
    )
vf.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vf.med.neg.cmpnd.to.incl <- vf.med.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
nrow(vf.med.neg.raw.grp.diff)
[1] 56
nrow(vf.med.neg.cmpnd.to.incl)
[1] 52

2.3.4 Media / Negative Mode

vf.med.pos.raw.grp.mean <- vf.med.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("hVPA_FApM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.med.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA HILIC / Media / Positive Mode\nGrouped by sample type")

vf.med.pos.raw.grp.mean.order <- vf.med.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vf.med.pos.raw %>% 
  select(Samples, Group, starts_with("hVPA_FApM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 2) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.med.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.med.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA HILIC / Media / Positive Mode")

vf.med.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA HILIC / Media / Positive Mode")

vf.med.pos.raw.grp.diff <- vf.med.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_solv_diff = sample / solv
    )
vf.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vf.med.pos.cmpnd.to.incl <- vf.med.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) 
nrow(vf.med.pos.raw.grp.diff)
[1] 47
nrow(vf.med.pos.cmpnd.to.incl)
[1] 47

3 Data Prep and Preliminary Analysis

3.1 Cleanup

vf.cell.neg.noNA <- vf.cell.neg.raw %>% 
  select(Samples:Plate, one_of(vf.cell.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformMult("hVPA_FAnC")
vf.cell.pos.noNA <- vf.cell.pos.raw %>% 
  select(Samples:Plate, one_of(vf.cell.pos.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformMult("hVPA_FApC")
vf.med.neg.noNA <- vf.med.neg.raw %>% 
  select(Samples:Plate, one_of(vf.med.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformMult("hVPA_FAnM")
vf.med.pos.noNA <- vf.med.pos.raw %>% 
  select(Samples:Plate, one_of(vf.med.pos.cmpnd.to.incl$Compound)) %>%  
  ReplaceNAwMinLogTransformMult("hVPA_FApM")

3.2 Distribution plots

3.2.1 Cells / Negative Mode

vf.cell.neg.noNA.gathered <- vf.cell.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.cell.neg.noNA) == "hVPA_FAnC10"):ncol(vf.cell.neg.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vf.cell.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA HILIC / Cells / Negative Mode")

# same data format, but as ridge plots
vf.cell.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA HILIC/ Cells / Negative Mode")

# experimental samples only
vf.cell.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Cells / Negative Mode")

# overlay the distributions for another look
vf.cell.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Cells / Negative Mode")

3.2.2 Cells / Positive Mode

vf.cell.pos.noNA.gathered <- vf.cell.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.cell.pos.noNA) == "hVPA_FApC1"):ncol(vf.cell.pos.noNA)
    )
vf.cell.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75, trim = TRUE, adjust = 3/4) +
  xlab("log2(Abundance)") +
  facet_wrap(~ Treatment)

3.2.3 Media / Negative Mode

vf.med.neg.noNA.gathered <- vf.med.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.med.neg.noNA) == "hVPA_FAnM10"):ncol(vf.med.neg.noNA)
    )
vf.med.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA HILIC / Media / Negative Mode")

vf.med.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA HILIC / Media / Negative Mode")

vf.med.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Media / Negative Mode")

vf.med.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Media / Negative Mode")

3.2.4 Media / Positive Mode

vf.med.pos.noNA.gathered <- vf.med.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.med.pos.noNA) == "hVPA_FApM1"):ncol(vf.med.pos.noNA)
    )
vf.med.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA HILIC / Media / Positive Mode")

vf.med.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA HILIC / Media / Positive Mode")

vf.med.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA HILIC  / Media / Positive Mode")

vf.med.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA HILIC / Media / Positive Mode")

3.3 Principal Component Analysis

Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.

3.3.1 Cells / Negative Mode

### PCA on all Samples ###
vf.cell.neg.full.pca <- vf.cell.neg.noNA %>% 
  select(starts_with("hVPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.neg.full.pca$sdev ^ 2) * 100 / sum(vf.cell.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA HILIC / Cells / Negative Mode",
  type = "b"
  )

vf.cell.neg.full.pca.x <- as.data.frame(vf.cell.neg.full.pca$x)
row.names(vf.cell.neg.full.pca.x) <- vf.cell.neg.noNA$Samples
vf.cell.neg.full.pca.x <- vf.cell.neg.full.pca.x %>% 
  bind_cols(vf.cell.neg.noNA %>% select(Group:Plate))
vf.cell.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (96.4% Var)") +
  ylab("PC2 (14.5% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA HILIC / Cells / Negative Mode")

### Samples and mix ###
vf.cell.neg.smpl.mix.pca <- vf.cell.neg.noNA %>% 
  filter(Group == "sample" | Group == "mix") %>% 
  select(starts_with("hVPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.neg.smpl.mix.pca$sdev ^ 2) * 100 / sum(vf.cell.neg.smpl.mix.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and Mix\nVPA + FA HILIC / Cells / Negative Mode",
  type = "b"
  )

vf.cell.neg.smpl.mix.pca.x <- as.data.frame(vf.cell.neg.smpl.mix.pca$x)
vf.cell.neg.smpl.mix.pca.x <- vf.cell.neg.smpl.mix.pca.x %>% 
  bind_cols(
    vf.cell.neg.noNA %>% 
      filter(Group == "sample" | Group == "mix") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.neg.smpl.mix.pca.x) <- vf.cell.neg.smpl.mix.pca.x$Samples
vf.cell.neg.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (63.1% Var)") +
  ylab("PC2 (14.0% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA HILIC / Cells / Negative Mode")

vf.cell.neg.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (6.2% Var)") +
  ylab("PC4 (4.7% Var)") +
  ggtitle("Principal Component Analysis\nSamples and mix\nVPA + FA HILIC / Cells / Negative Mode")

### Experimental Samples Only ###
vf.cell.neg.smpl.pca <- vf.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("hVPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(vf.cell.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA HILIC / Cells / Negative Mode",
  type = "b"
  )

vf.cell.neg.smpl.pca.x <- as.data.frame(vf.cell.neg.smpl.pca$x)
vf.cell.neg.smpl.pca.x <- vf.cell.neg.smpl.pca.x %>% 
  bind_cols(
    vf.cell.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.neg.smpl.pca.x) <- vf.cell.neg.smpl.pca.x$Samples
vf.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (65.3% Var)") +
  ylab("PC2 (14.6% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA HILIC / Cells / Negative Mode")

vf.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (5.7% Var)") +
  ylab("PC4 (4.2% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA HILIC/ Cells / Negative Mode")

3.3.2 Cells / Positive Mode

vf.cell.pos.full.pca <- vf.cell.pos.noNA %>% 
  select(starts_with("hVPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.pos.full.pca$sdev ^ 2) * 100 / sum(vf.cell.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA HILIC / Cells / Positive Mode",
  type = "b"
  )

vf.cell.pos.full.pca.x <- as.data.frame(vf.cell.pos.full.pca$x)
row.names(vf.cell.pos.full.pca.x) <- vf.cell.pos.noNA$Samples
vf.cell.pos.full.pca.x <- vf.cell.pos.full.pca.x %>% 
  bind_cols(vf.cell.pos.noNA %>% select(Group:Plate))
vf.cell.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (92.4% Var)") +
  ylab("PC2 (2.7% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA HILIC / Cells / Positive Mode")

### Samples and Mix ###
vf.cell.pos.smpl.mix.pca <- vf.cell.pos.noNA %>% 
  filter(Group == "sample" | Group == "mix") %>% 
  select(starts_with("hVPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.pos.smpl.mix.pca$sdev ^ 2) * 100 / sum(vf.cell.pos.smpl.mix.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and Mix\nVPA + FA HILIC / Cells / Positive Mode",
  type = "b"
  )

vf.cell.pos.smpl.mix.pca.x <- as.data.frame(vf.cell.pos.smpl.mix.pca$x)
vf.cell.pos.smpl.mix.pca.x <- vf.cell.pos.smpl.mix.pca.x %>% 
  bind_cols(
    vf.cell.pos.noNA %>% 
      filter(Group == "sample" | Group == "mix") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.pos.smpl.mix.pca.x) <- vf.cell.pos.smpl.mix.pca.x$Samples
vf.cell.pos.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_", remove = FALSE) %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (62.9% Var)") +
  ylab("PC2 (13.6% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (7.2% Var)") +
  ylab("PC4 (3.8% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA HILIC / Cells / Positive Mode")

### Experimental Samples Only ###
vf.cell.pos.smpl.pca <- vf.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("hVPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(vf.cell.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA HILIC/ Cells / Positive Mode",
  type = "b"
  )

vf.cell.pos.smpl.pca.x <- as.data.frame(vf.cell.pos.smpl.pca$x)
vf.cell.pos.smpl.pca.x <- vf.cell.pos.smpl.pca.x %>% 
  bind_cols(
    vf.cell.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.pos.smpl.pca.x) <- vf.cell.pos.smpl.pca.x$Samples
vf.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (61.2% Var)") +
  ylab("PC2 (15.2% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA HILIC / Cells / Positive Mode")

vf.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (7.7% Var)") +
  ylab("PC4 (4.3% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA HILIC / Cells / Positive Mode")

3.3.3 Media / Negative Mode

### PCA on all Samples ###
vf.med.neg.full.pca <- vf.med.neg.noNA %>% 
  select(starts_with("hVPA_FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.neg.full.pca$sdev ^ 2) * 100 / sum(vf.med.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA / Media / Negative Mode",
  type = "b"
  )

vf.med.neg.full.pca.x <- as.data.frame(vf.med.neg.full.pca$x)
row.names(vf.med.neg.full.pca.x) <- vf.med.neg.noNA$Samples
vf.med.neg.full.pca.x <- vf.med.neg.full.pca.x %>% 
  bind_cols(vf.med.neg.noNA %>% select(Group:Plate))
vf.med.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (89.8% Var)") +
  ylab("PC2 (4.7% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA / Media / Negative Mode")

### Samples and Mix ###
vf.med.neg.smpl.mix.pca <- vf.med.neg.noNA %>% 
  filter(Group == "sample" | Group == "mix") %>% 
  select(starts_with("hVPA_FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.neg.smpl.mix.pca$sdev ^ 2) * 100 / sum(vf.med.neg.smpl.mix.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and Mix\nVPA + FA / Media / Negative Mode",
  type = "b"
  )

vf.med.neg.smpl.mix.pca.x <- as.data.frame(vf.med.neg.smpl.mix.pca$x)
vf.med.neg.smpl.mix.pca.x <- vf.med.neg.smpl.mix.pca.x %>% 
  bind_cols(
    vf.med.neg.noNA %>% 
      filter(Group == "sample" | Group == "mix") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.neg.smpl.mix.pca.x) <- vf.med.neg.smpl.mix.pca.x$Samples
vf.med.neg.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (67.1% Var)") +
  ylab("PC2 (19.6% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA / Media / Negative Mode")

vf.med.neg.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (4.3% Var)") +
  ylab("PC4 (2.1% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA / Media / Negative Mode")

### Experimental Samples Only ###
vf.med.neg.smpl.pca <- vf.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("hVPA_FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(vf.med.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA / Media / Negative Mode",
  type = "b"
  )

vf.med.neg.smpl.pca.x <- as.data.frame(vf.med.neg.smpl.pca$x)
vf.med.neg.smpl.pca.x <- vf.med.neg.smpl.pca.x %>% 
  bind_cols(
    vf.med.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.neg.smpl.pca.x) <- vf.med.neg.smpl.pca.x$Samples
vf.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (74.4% Var)") +
  ylab("PC2 (12.1% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Negative Mode")

vf.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (4.5% Var)") +
  ylab("PC4 (2.2% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Negative Mode")

3.3.4 Media / Positive Mode

### PCA on all Samples ###
vf.med.pos.full.pca <- vf.med.pos.noNA %>% 
  select(starts_with("hVPA_FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.pos.full.pca$sdev ^ 2) * 100 / sum(vf.med.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA / Media / Positive Mode",
  type = "b"
  )

vf.med.pos.full.pca.x <- as.data.frame(vf.med.pos.full.pca$x)
row.names(vf.med.pos.full.pca.x) <- vf.med.pos.noNA$Samples
vf.med.pos.full.pca.x <- vf.med.pos.full.pca.x %>% 
  bind_cols(vf.med.pos.noNA %>% select(Group:Plate))
vf.med.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (88.6% Var)") +
  ylab("PC2 (4.7% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA / Media / Positive Mode")

### Samples and mix ###
vf.med.pos.smpl.mix.pca <- vf.med.pos.noNA %>% 
  filter(Group == "sample" | Group == "mix") %>% 
  select(starts_with("hVPA_FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.pos.smpl.mix.pca$sdev ^ 2) * 100 / sum(vf.med.pos.smpl.mix.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and Mix\nVPA + FA / Media / Positive Mode",
  type = "b"
  )

vf.med.pos.smpl.mix.pca.x <- as.data.frame(vf.med.pos.smpl.mix.pca$x)
vf.med.pos.smpl.mix.pca.x <- vf.med.pos.smpl.mix.pca.x %>% 
  bind_cols(
    vf.med.pos.noNA %>% 
      filter(Group == "sample" | Group == "mix") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.pos.smpl.mix.pca.x) <- vf.med.pos.smpl.mix.pca.x$Samples
vf.med.pos.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (53.7% Var)") +
  ylab("PC2 (18.9% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA / Media / Positive Mode")

vf.med.pos.smpl.mix.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.7% Var)") +
  ylab("PC4 (6.0% Var)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA + FA / Media / Positive Mode")

### Experimental Samples Only ###
vf.med.pos.smpl.pca <- vf.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("hVPA_FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(vf.med.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA / Media / Positive Mode",
  type = "b"
  )

vf.med.pos.smpl.pca.x <- as.data.frame(vf.med.pos.smpl.pca$x)
vf.med.pos.smpl.pca.x <- vf.med.pos.smpl.pca.x %>% 
  bind_cols(
    vf.med.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.pos.smpl.pca.x) <- vf.med.pos.smpl.pca.x$Samples
vf.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (53.8% Var)") +
  ylab("PC2 (19.8% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Positive Mode")

vf.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.8% Var)") +
  ylab("PC4 (5.8% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Positive Mode")

4 Batch Effects and Signifiance Testing

4.1 Cells / Negative Mode

# sample prep
vf.cell.neg.smpl.data <- vf.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.cell.neg.data.for.sva <- as.matrix(
  vf.cell.neg.smpl.data[, which(
    colnames(vf.cell.neg.smpl.data) == "hVPA_FAnC10"
    ):ncol(vf.cell.neg.smpl.data)]
  )
row.names(vf.cell.neg.data.for.sva) <- vf.cell.neg.smpl.data$Samples
vf.cell.neg.data.for.sva <- t(vf.cell.neg.data.for.sva)
# pheno prep
vf.cell.neg.data.pheno <- as.data.frame(vf.cell.neg.smpl.data[, 5:6])
row.names(vf.cell.neg.data.pheno) <- vf.cell.neg.smpl.data$Samples
# sva calculation
vf.cell.neg.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.cell.neg.data.pheno)
vf.cell.neg.mod0 <- model.matrix(~ 1, data = vf.cell.neg.data.pheno)
set.seed(2018)
num.sv(vf.cell.neg.data.for.sva, vf.cell.neg.mod.vf, method = "be")
[1] 1
set.seed(2018)
num.sv(vf.cell.neg.data.for.sva, vf.cell.neg.mod.vf, method = "leek")
[1] 1
set.seed(2018)
vf.cell.neg.sv <- sva(vf.cell.neg.data.for.sva, vf.cell.neg.mod.vf, vf.cell.neg.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vf.cell.neg.surr.var <- as.data.frame(vf.cell.neg.sv$sv)
colnames(vf.cell.neg.surr.var) <- c("S1")


plot(vf.cell.neg.smpl.pca.x$PC1, vf.cell.pos.smpl.pca.x$PC1)

plot(vf.cell.pos.smpl.pca.x$PC1, vf.cell.neg.surr.var$S1)

vf.cell.neg.covar <- vf.cell.neg.smpl.pca.x %>% 
  select(Samples:Plate, PC1:PC4) %>% 
  bind_cols(vf.cell.neg.surr.var)
vf.cell.neg.covar %>% 
  select(PC1:S1) %>% 
  ggcorr(label = TRUE)

vf.cell.neg.covar %>% 
  select(PC1:S1) %>% 
  ggpairs()

colnames(vf.cell.neg.mod.vf) <- c("fa_only", "cntrl", "fa_and_vpa", "vpa_only")
# combine the full model matrix and the surrogate variables into one
vf.cell.neg.design.sv <- cbind(vf.cell.neg.mod.vf, vf.cell.neg.surr.var)
vf.cell.neg.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("fa_only", "cntrl", "fa_and_vpa", "vpa_only", "S1")
  )
# fit the model/design matrix
vf.cell.neg.eb <- vf.cell.neg.data.for.sva %>% 
  lmFit(vf.cell.neg.design.sv) %>% 
  contrasts.fit(vf.cell.neg.cont.mat) %>% 
  eBayes()
# pull out the results for each metabolite for each comparison
vf.cell.neg.eb.tidy <- tidy(vf.cell.neg.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.cell.neg.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlim(-2.5, 2.5)  +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA HILIC / Cells / Negative Mode")

# select statistically significant hits with a certain FC:
vf.cell.neg.hits <- vf.cell.neg.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.cell.neg.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.cell.neg.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0         14         13 
# significant metabolites
sort(unique(vf.cell.neg.hits$compound_full))
 [1] "3-Sulfinoalanine"                      
 [2] "3,4-Dihydroxyphenylacetic Acid (DOPAC)"
 [3] "ATP"                                   
 [4] "BAIBA"                                 
 [5] "Caprylic Acid"                         
 [6] "Creatinine"                            
 [7] "Cystathionine"                         
 [8] "D-Galactitol"                          
 [9] "D-Glucose 6-phosphate"                 
[10] "D-Sorbitol"                            
[11] "Docosahexaenoic Acid (22:6 n-3)"       
[12] "GABA"                                  
[13] "Glutamic Acid"                         
[14] "GSSG"                                  
[15] "myo-Inositol"                          
[16] "N-Acetylglutamic Acid"                 
[17] "N-Acetylserine"                        
[18] "Oleic Acid"                            
[19] "Palmitoleic Acid"                      
[20] "Proline"                               
[21] "Tryptophan"                            
[22] "UDP-N-Acetylgalactosamine"             
[23] "UTP"                                   
#write_csv(vf.cell.neg.hits, path = "./results/vpa_fa_p5_hilic_cell_neg_hits.csv")
vf.cell.neg.hits.tally2 <- vf.cell.neg.hits %>% 
  group_by(compound_short, compound_full) %>% 
  count() %>% 
  filter(n == 2)
vf.cell.neg.lowFA.hits <- vf.cell.neg.hits %>% 
  filter(term == "vpa_lowFA" & !(compound_short %in% vf.cell.neg.hits.tally2$compound_short))
vf.cell.neg.highFA.hits <- vf.cell.neg.hits %>% 
  filter(term == "vpa_highFA" & !(compound_short %in% vf.cell.neg.hits.tally2$compound_short))
vf.cell.neg.both.hits <- vf.cell.neg.hits %>% 
  filter(compound_short %in% vf.cell.neg.hits.tally2$compound_short) %>% 
  arrange(compound_short, term)
vf.cell.neg.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  ggplot(aes(vpa_lowFA, vpa_highFA)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_abline(intercept = 0, slope = 1, color = "blue", alpha = 0.8)

vf.cell.neg.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  mutate(diff = vpa_highFA - vpa_lowFA) %>% 
  arrange(diff)
# A tibble: 4 x 4
  compound_full                   vpa_highFA vpa_lowFA    diff
  <chr>                                <dbl>     <dbl>   <dbl>
1 Proline                              2.66      3.00  -0.338 
2 Cystathionine                        2.00      2.25  -0.253 
3 D-Sorbitol                           0.374     0.456 -0.0824
4 Docosahexaenoic Acid (22:6 n-3)      2.94      1.86   1.08  
### Plotting ###
vf.cell.neg.gathered <- vf.cell.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vf.cell.neg.surr.var) %>% 
  select(Samples, VPA, FA, S1, starts_with("hVPA_FAnC")) %>% 
  gather(key = "Compound", value = "Abundance", hVPA_FAnC10:hVPA_FAnC99)
vf.cell.neg.nested <- vf.cell.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vf.cell.neg.modSV.resid <- vf.cell.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, VPA, FA, Compound, .resid) %>% 
  spread(Compound, .resid)
vf.cell.neg.modSV.resid %>% 
  select(Samples:FA, one_of(unique(vf.cell.neg.hits$compound_short))) %>% 
  HeatmapPrepAlt("hVPA_FAnC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA + FA HILIC / Cells / Neg Mode",
   # margins = c(50, 50, 75, 30),
    k_col = 2, k_row = 2,
    labRow = unique(vf.cell.neg.hits$compound_full)
    )
### PCA - cleaned data ###
vf.cell.neg.modSV.pca <- vf.cell.neg.modSV.resid %>% 
  select(starts_with("hVPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vf.cell.neg.modSV.pca.x <- as.data.frame(vf.cell.neg.modSV.pca$x)
row.names(vf.cell.neg.modSV.pca.x) <- vf.cell.neg.modSV.resid$Samples
vf.cell.neg.modSV.pca.x <- vf.cell.neg.modSV.pca.x %>% 
  bind_cols(vf.cell.neg.modSV.resid %>% select(VPA:FA))
vf.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (35.3% Var)") +
  ylab("PC2 (26.4% Var)")

vf.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.6% Var)") +
  ylab("PC4 (5.7% Var)")

4.2 Cells / Positive Mode

vf.cell.pos.smpl.data <- vf.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.cell.pos.data.for.sva <- as.matrix(
  vf.cell.pos.smpl.data[, which(
    colnames(vf.cell.pos.smpl.data) == "hVPA_FApC1"
    ):ncol(vf.cell.pos.smpl.data)]
  )
row.names(vf.cell.pos.data.for.sva) <- vf.cell.pos.smpl.data$Samples
vf.cell.pos.data.for.sva <- t(vf.cell.pos.data.for.sva)
vf.cell.pos.data.pheno <- as.data.frame(vf.cell.pos.smpl.data[, 5:6])
row.names(vf.cell.pos.data.pheno) <- vf.cell.pos.smpl.data$Samples
vf.cell.pos.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.cell.pos.data.pheno)
vf.cell.pos.mod0 <- model.matrix(~ 1, data = vf.cell.pos.data.pheno)
set.seed(2018)
num.sv(vf.cell.pos.data.for.sva, vf.cell.pos.mod.vf, method = "be")
[1] 1
set.seed(2018)
num.sv(vf.cell.pos.data.for.sva, vf.cell.pos.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.cell.pos.sv <- sva(vf.cell.pos.data.for.sva, vf.cell.pos.mod.vf, vf.cell.pos.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
vf.cell.pos.surr.var <- as.data.frame(vf.cell.pos.sv$sv)
colnames(vf.cell.pos.surr.var) <- c("S1")


plot(vf.cell.pos.smpl.pca.x$PC1, vf.cell.pos.surr.var$S1)

vf.cell.pos.covar <- vf.cell.pos.smpl.pca.x %>% 
  select(Samples:Plate, PC1:PC4) %>% 
  bind_cols(vf.cell.pos.surr.var)
vf.cell.pos.covar %>% 
  select(PC1:S1) %>% 
  ggcorr(label = TRUE)

vf.cell.pos.covar %>% 
  select(PC1:S1) %>% 
  ggpairs()

colnames(vf.cell.pos.mod.vf) <- c("fa_only", "cntrl", "fa_and_vpa", "vpa_only")
vf.cell.pos.design.sv <- cbind(vf.cell.pos.mod.vf, vf.cell.pos.smpl.pca.x$PC1)
colnames(vf.cell.pos.design.sv)[5] <- "PC1"
vf.cell.pos.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("fa_only", "cntrl", "fa_and_vpa", "vpa_only", "PC1")
  )
vf.cell.pos.eb <- vf.cell.pos.data.for.sva %>% 
  lmFit(vf.cell.pos.design.sv) %>% 
  contrasts.fit(vf.cell.pos.cont.mat) %>% 
  eBayes()
vf.cell.pos.eb.tidy <- tidy(vf.cell.pos.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
vf.cell.pos.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  #xlim(-2.5, 2.5) +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.hits <- vf.cell.pos.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.cell.pos.compound.info, by = "compound_short")
table(vf.cell.pos.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0         11         23 
sort(unique(vf.cell.pos.hits$compound_full))
 [1] "2'-Deoxyadenosine"              "Acetylputrescine"              
 [3] "Adenosine"                      "AMP"                           
 [5] "Aspartic Acid"                  "CDP-Choline"                   
 [7] "Cystathionine"                  "D-Fructose 6-phosphate"        
 [9] "D-Sorbitol"                     "dGMP"                          
[11] "GDP"                            "Glutathione (GSH)"             
[13] "Glycerol-3-phosphocholine"      "GMP"                           
[15] "Guanosine"                      "Homoserine"                    
[17] "Hypoxanthine"                   "Inosine"                       
[19] "N-Acetylaspartyl Glutamic Acid" "N-Acetylglucosamine"           
[21] "N-Acetylmannosamine"            "N-Methylnicotinate"            
[23] "Nicotinamide"                   "Nicotinamide Mononucleotide"   
[25] "Proline"                        "Pyridoxal"                     
[27] "UMP"                           
vf.cell.pos.hits
# A tibble: 34 x 14
   compound_short term  estimate statistic  p.value   lod adj_pval    FC
   <chr>          <fct>    <dbl>     <dbl>    <dbl> <dbl>    <dbl> <dbl>
 1 hVPA_FApC102   vpa_~    0.803      5.23 4.55e- 5  1.66  1.94e-2 1.74 
 2 hVPA_FApC103   vpa_~    0.453      5.05 6.74e- 5  1.27  2.87e-2 1.37 
 3 hVPA_FApC105   vpa_~   -1.39      -7.86 1.94e- 7  7.19  8.27e-5 0.381
 4 hVPA_FApC107   vpa_~   -1.36      -7.87 1.89e- 7  7.21  8.06e-5 0.390
 5 hVPA_FApC113   vpa_~   -1.04      -5.16 5.35e- 5  1.50  2.28e-2 0.486
 6 hVPA_FApC114   vpa_~   -1.44      -5.06 6.70e- 5  1.27  2.85e-2 0.369
 7 hVPA_FApC117   vpa_~   -1.22      -5.36 3.41e- 5  1.95  1.45e-2 0.428
 8 hVPA_FApC127   vpa_~    0.880      5.14 5.56e- 5  1.46  2.37e-2 1.84 
 9 hVPA_FApC18    vpa_~    1.40      13.1  4.54e-11 15.6   1.94e-8 2.64 
10 hVPA_FApC23    vpa_~    0.398      4.98 7.91e- 5  1.11  3.37e-2 1.32 
# ... with 24 more rows, and 6 more variables: change_in_vpa <chr>,
#   compound_full <chr>, formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
#write_csv(vf.cell.pos.hits, path = "./results/vpa_fa_p5_hilic_cell_pos_hits.csv")
vf.cell.pos.hits.tally2 <- vf.cell.pos.hits %>% 
  group_by(compound_short, compound_full) %>% 
  count() %>% 
  filter(n == 2)
vf.cell.pos.lowFA.hits <- vf.cell.pos.hits %>% 
  filter(term == "vpa_lowFA" & !(compound_short %in% vf.cell.pos.hits.tally2$compound_short))
vf.cell.pos.highFA.hits <- vf.cell.pos.hits %>% 
  filter(term == "vpa_highFA" & !(compound_short %in% vf.cell.pos.hits.tally2$compound_short))
vf.cell.pos.both.hits <- vf.cell.pos.hits %>% 
  filter(compound_short %in% vf.cell.pos.hits.tally2$compound_short) %>% 
  arrange(compound_short, term)
vf.cell.pos.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  ggplot(aes(vpa_lowFA, vpa_highFA)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_abline(intercept = 0, slope = 1, color = "blue", alpha = 0.8)

vf.cell.pos.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  mutate(diff = vpa_highFA - vpa_lowFA) %>% 
  arrange(diff)
# A tibble: 7 x 4
  compound_full               vpa_highFA vpa_lowFA    diff
  <chr>                            <dbl>     <dbl>   <dbl>
1 Proline                          2.16      2.64  -0.485 
2 Cystathionine                    1.53      1.81  -0.283 
3 D-Sorbitol                       0.351     0.469 -0.118 
4 Glycerol-3-phosphocholine        0.434     0.446 -0.0120
5 UMP                              0.453     0.381  0.0717
6 Nicotinamide Mononucleotide      0.463     0.390  0.0730
7 CDP-Choline                      2.28      1.84   0.437 
### Plotting ###
vf.cell.pos.gathered <- vf.cell.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vf.cell.pos.surr.var) %>% 
  select(Samples, VPA, FA, S1, starts_with("hVPA_FApC")) %>% 
  gather(key = "Compound", value = "Abundance", hVPA_FApC1:hVPA_FApC99)
vf.cell.pos.nested <- vf.cell.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vf.cell.pos.modSV.resid <- vf.cell.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, VPA, FA, Compound, .resid) %>% 
  spread(Compound, .resid)
vf.cell.pos.modSV.resid %>% 
  select(Samples:FA, one_of(unique(vf.cell.pos.hits$compound_short))) %>% 
  HeatmapPrepAlt("hVPA_FApC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA + FA HILIC / Cells / Positive Mode",
    margins = c(50, 50, 75, 30),
    k_col = 2, k_row = 2,
    labRow = unique(vf.cell.pos.hits$compound_full)
    )
### PCA - cleaned data ###
vf.cell.pos.modSV.pca <- vf.cell.pos.modSV.resid %>% 
  select(starts_with("hVPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vf.cell.pos.modSV.pca.x <- as.data.frame(vf.cell.pos.modSV.pca$x)
row.names(vf.cell.pos.modSV.pca.x) <- vf.cell.pos.modSV.resid$Samples
vf.cell.pos.modSV.pca.x <- vf.cell.pos.modSV.pca.x %>% 
  bind_cols(vf.cell.pos.modSV.resid %>% select(VPA:FA))
vf.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (66.4% Var)") +
  ylab("PC2 (12.5% Var)")

vf.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.1% Var)") +
  ylab("PC4 (3.1% Var)")

4.3 Media / Negative Mode

# sample prep
vf.med.neg.smpl.data <- vf.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.med.neg.data.for.sva <- as.matrix(
  vf.med.neg.smpl.data[, which(
    colnames(vf.med.neg.smpl.data) == "hVPA_FAnM10"
    ):ncol(vf.med.neg.smpl.data)]
  )
row.names(vf.med.neg.data.for.sva) <- vf.med.neg.smpl.data$Samples
vf.med.neg.data.for.sva <- t(vf.med.neg.data.for.sva)
# pheno prep
vf.med.neg.data.pheno <- as.data.frame(vf.med.neg.smpl.data[, 5:6])
row.names(vf.med.neg.data.pheno) <- vf.med.neg.smpl.data$Samples
# sva calculation
vf.med.neg.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.med.neg.data.pheno)
vf.med.neg.mod0 <- model.matrix(~ 1, data = vf.med.neg.data.pheno)
set.seed(2018)
num.sv(vf.med.neg.data.for.sva, vf.med.neg.mod.vf, method = "be")
[1] 0
set.seed(2018)
num.sv(vf.med.neg.data.for.sva, vf.med.neg.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.med.neg.sv <- sva(vf.med.neg.data.for.sva, vf.med.neg.mod.vf, vf.med.neg.mod0)
No significant surrogate variables
# extract the surrogate variables
colnames(vf.med.neg.mod.vf) <- c("fa_only", "cntrl", "fa_and_vpa", "vpa_only")
# combine the full model matrix and the surrogate variables into one
vf.med.neg.design.sv <- vf.med.neg.mod.vf
vf.med.neg.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("fa_only", "cntrl", "fa_and_vpa", "vpa_only")
  )
# fit the model/design matrix
vf.med.neg.eb <- vf.med.neg.data.for.sva %>% 
  lmFit(vf.med.neg.design.sv) %>% 
  contrasts.fit(vf.med.neg.cont.mat) %>% 
  eBayes()
# pull out the results for each metabolite for each comparison
vf.med.neg.eb.tidy <- tidy(vf.med.neg.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.med.neg.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / meds / Negative Mode")

# select statistically significant hits with a certain FC:
vf.med.neg.hits <- vf.med.neg.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.med.neg.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.med.neg.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0          1          2 
# significant metabolites
sort(unique(vf.med.neg.hits$compound_full))
[1] "Caprylic Acid" "Taurine"      
vf.med.neg.hits
# A tibble: 3 x 14
  compound_short term  estimate statistic  p.value   lod adj_pval    FC
  <chr>          <fct>    <dbl>     <dbl>    <dbl> <dbl>    <dbl> <dbl>
1 hVPA_FAnM14    vpa_~    0.519      6.47 1.12e- 6  3.97 1.74e- 4  1.43
2 hVPA_FAnM21    vpa_~    5.44      68.5  7.15e-29 56.3  1.12e-26 43.3 
3 hVPA_FAnM21    vpa_~    5.48      69.1  5.78e-29 56.5  9.02e-27 44.8 
# ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
#   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
#write_csv(vf.med.neg.hits, path = "./results/vpa_fa_p5_hilic_media_neg_hits.csv")
vf.med.neg.gathered <- vf.med.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(inter = rep(1, 24)) %>% 
  select(Samples, VPA, FA, inter, starts_with("hVPA_FAnM")) %>% 
  gather(key = "Compound", value = "Abundance", hVPA_FAnM10:hVPA_FAnM9)
vf.med.neg.nested <- vf.med.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ inter, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vf.med.neg.modSV.resid <- vf.med.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, VPA, FA, Compound, .resid) %>% 
  spread(Compound, .resid)

4.4 Media / Positive Mode

# sample prep
vf.med.pos.smpl.data <- vf.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.med.pos.data.for.sva <- as.matrix(
  vf.med.pos.smpl.data[, which(
    colnames(vf.med.pos.smpl.data) == "hVPA_FApM1"
    ):ncol(vf.med.pos.smpl.data)]
  )
row.names(vf.med.pos.data.for.sva) <- vf.med.pos.smpl.data$Samples
vf.med.pos.data.for.sva <- t(vf.med.pos.data.for.sva)
vf.med.pos.data.pheno <- as.data.frame(vf.med.pos.smpl.data[, 5:6])
row.names(vf.med.pos.data.pheno) <- vf.med.pos.smpl.data$Samples
vf.med.pos.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.med.pos.data.pheno)
vf.med.pos.mod0 <- model.matrix(~ 1, data = vf.med.pos.data.pheno)
set.seed(2018)
num.sv(vf.med.pos.data.for.sva, vf.med.pos.mod.vf, method = "be")
[1] 0
set.seed(2018)
num.sv(vf.med.pos.data.for.sva, vf.med.pos.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.med.pos.sv <- sva(vf.med.pos.data.for.sva, vf.med.pos.mod.vf, vf.med.pos.mod0)
No significant surrogate variables
colnames(vf.med.pos.mod.vf) <- c("fa_only", "cntrl", "fa_and_vpa", "vpa_only")
vf.med.pos.design.sv <- vf.med.pos.mod.vf
vf.med.pos.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("fa_only", "cntrl", "fa_and_vpa", "vpa_only")
  )
vf.med.pos.eb <- vf.med.pos.data.for.sva %>% 
  lmFit(vf.med.pos.design.sv) %>% 
  contrasts.fit(vf.med.pos.cont.mat) %>% 
  eBayes()
vf.med.pos.eb.tidy <- tidy(vf.med.pos.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.med.pos.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / meds / posative Mode")

# select statistically significant hits with a certain FC:
vf.med.pos.hits <- vf.med.pos.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.med.pos.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.med.pos.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0          0          0 
vf.med.pos.gathered <- vf.med.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(inter = rep(1, 24)) %>% 
  select(Samples, VPA, FA, inter, starts_with("hVPA_FApM")) %>% 
  gather(key = "Compound", value = "Abundance", hVPA_FApM1:hVPA_FApM9)
vf.med.pos.nested <- vf.med.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ inter, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vf.med.pos.modSV.resid <- vf.med.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, VPA, FA, Compound, .resid) %>% 
  spread(Compound, .resid)

4.5 Profile plots

### Cell Neg Mode ###
vf.cell.neg.resid.for.profile <- vf.cell.neg.modSV.resid %>% 
  select(Samples:FA, one_of(vf.cell.neg.hits$compound_short)) %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  gather("Compound", value = "Abundance", -c(Samples, Treatment))
vf.cell.neg.resid.for.profile %>% 
  group_by(Samples) %>% 
  count() %>% 
  filter(n != 23)
# A tibble: 0 x 2
# Groups:   Samples [0]
# ... with 2 variables: Samples <chr>, n <int>
vf.cell.neg.resid.avgs <- vf.cell.neg.resid.for.profile %>% 
  group_by(Compound, Treatment) %>% 
  summarize(avg.abun = mean(Abundance)) %>% 
  ungroup()
vf.cell.neg.resid.order <- vf.cell.neg.resid.avgs %>% 
  separate(Treatment, into = c("VPA", "FA"), sep = "_") %>% 
  spread(key = "VPA", value = "avg.abun") %>% 
  mutate(FC = vpa - cntrl) %>% 
  group_by(Compound) %>% 
  summarize(avg_FC = mean(FC)) %>% 
  arrange(avg_FC) %>% 
  mutate(compound_sort = factor(Compound)) %>% 
  inner_join(
    vf.cell.neg.hits %>% 
      select(compound_short, compound_full:cas_id) %>% 
      distinct(), 
    by = c("Compound" = "compound_short")
    )
vf.cell.neg.modSV.resid.prof.plot <- vf.cell.neg.resid.for.profile %>% 
  mutate(compound_order = factor(Compound, levels = vf.cell.neg.resid.order$compound_sort)) %>% 
  ggplot(aes(compound_order, Abundance, color = Treatment, group = Samples)) + 
  geom_line(alpha = 0.3, size = 1.5) + 
  theme(
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    ) +
  scale_x_discrete(name = "Compound", labels = vf.cell.neg.resid.order$compound_full) +
  scale_color_manual(values = c("#56B4E9", "#0072B2", "#E69F00",  "#D55E00"), labels = c("Control", "5X FA", "VPA", "VPA + 5X FA")) +
  ylab("log2(Compound Resid)") +
  # overlay the group averages 
  geom_line(
    data = vf.cell.neg.resid.avgs,
    aes(Compound, avg.abun, color = Treatment, group = Treatment),
    size = 2
    ) +
  theme(plot.margin = margin(5, 5, 5, 40))
vf.cell.neg.modSV.resid.prof.plot

### Cell Pos Mode ###
vf.cell.pos.resid.for.profile <- vf.cell.pos.modSV.resid %>% 
  select(Samples:FA, one_of(vf.cell.pos.hits$compound_short)) %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  gather("Compound", value = "Abundance", -c(Samples, Treatment))
vf.cell.pos.resid.for.profile %>% 
  group_by(Samples) %>% 
  count() %>% 
  filter(n != 27)
# A tibble: 0 x 2
# Groups:   Samples [0]
# ... with 2 variables: Samples <chr>, n <int>
vf.cell.pos.resid.avgs <- vf.cell.pos.resid.for.profile %>% 
  group_by(Compound, Treatment) %>% 
  summarize(avg.abun = mean(Abundance)) %>% 
  ungroup()
vf.cell.pos.resid.order <- vf.cell.pos.resid.avgs %>% 
  separate(Treatment, into = c("VPA", "FA"), sep = "_") %>% 
  spread(key = "VPA", value = "avg.abun") %>% 
  mutate(FC = vpa - cntrl) %>% 
  group_by(Compound) %>% 
  summarize(avg_FC = mean(FC)) %>% 
  arrange(avg_FC) %>% 
  mutate(compound_sort = factor(Compound)) %>% 
  inner_join(
    vf.cell.pos.hits %>% 
      select(compound_short, compound_full:cas_id) %>% 
      distinct(), 
    by = c("Compound" = "compound_short")
    )
vf.cell.pos.modSV.resid.prof.plot <- vf.cell.pos.resid.for.profile %>% 
  mutate(compound_order = factor(Compound, levels = vf.cell.pos.resid.order$compound_sort)) %>% 
  ggplot(aes(compound_order, Abundance, color = Treatment, group = Samples)) + 
  geom_line(alpha = 0.3, size = 1.5) + 
  theme(
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    ) +
  scale_x_discrete(name = "Compound", labels = vf.cell.pos.resid.order$compound_full) +
  scale_color_manual(values = c("#56B4E9", "#0072B2", "#E69F00",  "#D55E00"), labels = c("Control", "5X FA", "VPA", "VPA + 5X FA")) +
  ylab("log2(Compound Resid)") +
  # overlay the group averages 
  geom_line(
    data = vf.cell.pos.resid.avgs,
    aes(Compound, avg.abun, color = Treatment, group = Treatment),
    size = 2
    ) +
  theme(plot.margin = margin(5, 5, 5, 40))
vf.cell.pos.modSV.resid.prof.plot

vf.cell.resid.prof.grid.plot <- plot_grid(
  vf.cell.neg.modSV.resid.prof.plot,
  vf.cell.pos.modSV.resid.prof.plot,
  labels = c("A", "B"),
  ncol = 1
)
#save_plot("./results/vf_p5_cell_resid_profile_grid_plot.png", vf.cell.resid.prof.grid.plot, base_height = 10, base_width = 8)

4.6 Hits tallies

vf.cell.neg.both.hits
## # A tibble: 8 x 14
##   compound_short term  estimate statistic  p.value   lod adj_pval    FC
##   <chr>          <fct>    <dbl>     <dbl>    <dbl> <dbl>    <dbl> <dbl>
## 1 hVPA_FAnC115   vpa_~    1.56      10.9  9.67e-10 12.5   3.83e-7 2.94 
## 2 hVPA_FAnC115   vpa_~    0.894      5.58 2.04e- 5  2.58  8.08e-3 1.86 
## 3 hVPA_FAnC19    vpa_~    1.41      10.8  1.10e- 9 12.3   4.36e-7 2.66 
## 4 hVPA_FAnC19    vpa_~    1.59      10.8  1.15e- 9 12.4   4.57e-7 3.00 
## 5 hVPA_FAnC78    vpa_~   -1.42     -14.9  4.45e-12 17.9   1.76e-9 0.374
## 6 hVPA_FAnC78    vpa_~   -1.13     -10.5  1.75e- 9 12.0   6.91e-7 0.456
## 7 hVPA_FAnC90    vpa_~    1.00       5.18 4.93e- 5  1.42  1.95e-2 2.00 
## 8 hVPA_FAnC90    vpa_~    1.17       5.40 3.04e- 5  2.18  1.20e-2 2.25 
## # ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
## #   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
vf.cell.pos.both.hits
## # A tibble: 14 x 14
##    compound_short term  estimate statistic  p.value   lod adj_pval    FC
##    <chr>          <fct>    <dbl>     <dbl>    <dbl> <dbl>    <dbl> <dbl>
##  1 hVPA_FApC105   vpa_~   -1.14      -7.34 5.30e- 7  5.91 2.26e- 4 0.453
##  2 hVPA_FApC105   vpa_~   -1.39      -7.86 1.94e- 7  7.19 8.27e- 5 0.381
##  3 hVPA_FApC107   vpa_~   -1.11      -7.32 5.50e- 7  5.87 2.34e- 4 0.463
##  4 hVPA_FApC107   vpa_~   -1.36      -7.87 1.89e- 7  7.21 8.06e- 5 0.390
##  5 hVPA_FApC127   vpa_~    1.19       7.88 1.86e- 7  6.98 7.94e- 5 2.28 
##  6 hVPA_FApC127   vpa_~    0.880      5.14 5.56e- 5  1.46 2.37e- 2 1.84 
##  7 hVPA_FApC18    vpa_~    1.11      11.8  2.81e-10 13.6  1.20e- 7 2.16 
##  8 hVPA_FApC18    vpa_~    1.40      13.1  4.54e-11 15.6  1.94e- 8 2.64 
##  9 hVPA_FApC69    vpa_~   -1.51     -18.3  1.11e-13 21.6  4.71e-11 0.351
## 10 hVPA_FApC69    vpa_~   -1.09     -11.7  3.45e-10 13.6  1.47e- 7 0.469
## 11 hVPA_FApC83    vpa_~    0.614      5.59 2.04e- 5  2.18 8.70e- 3 1.53 
## 12 hVPA_FApC83    vpa_~    0.859      6.88 1.35e- 6  5.22 5.73e- 4 1.81 
## 13 hVPA_FApC91    vpa_~   -1.20     -15.1  3.77e-12 18.0  1.60e- 9 0.434
## 14 hVPA_FApC91    vpa_~   -1.16     -12.8  6.54e-11 15.2  2.79e- 8 0.446
## # ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
## #   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
vf.cell.neg.lowFA.hits
## # A tibble: 9 x 14
##   compound_short term  estimate statistic p.value   lod adj_pval    FC
##   <chr>          <fct>    <dbl>     <dbl>   <dbl> <dbl>    <dbl> <dbl>
## 1 hVPA_FAnC127   vpa_~    0.918      5.33 3.55e-5 2.02   0.0140  1.89 
## 2 hVPA_FAnC128   vpa_~    0.812      4.99 7.61e-5 1.26   0.0301  1.76 
## 3 hVPA_FAnC133   vpa_~    0.594      5.02 7.14e-5 1.32   0.0283  1.51 
## 4 hVPA_FAnC134   vpa_~    0.845      4.82 1.11e-4 0.879  0.0440  1.80 
## 5 hVPA_FAnC18    vpa_~    0.669      5.67 1.67e-5 2.78   0.00662 1.59 
## 6 hVPA_FAnC39    vpa_~    1.17       5.33 3.54e-5 2.02   0.0140  2.24 
## 7 hVPA_FAnC50    vpa_~    1.72       5.05 6.70e-5 1.39   0.0265  3.30 
## 8 hVPA_FAnC65    vpa_~   -0.588     -5.28 4.01e-5 1.90   0.0159  0.665
## 9 hVPA_FAnC83    vpa_~    0.723      5.68 1.64e-5 2.80   0.00650 1.65 
## # ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
## #   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
vf.cell.neg.highFA.hits
## # A tibble: 10 x 14
##    compound_short term  estimate statistic p.value   lod adj_pval    FC
##    <chr>          <fct>    <dbl>     <dbl>   <dbl> <dbl>    <dbl> <dbl>
##  1 hVPA_FAnC10    vpa_~    0.436      4.99 7.61e-5 0.981 0.0301   1.35 
##  2 hVPA_FAnC103   vpa_~    0.939      7.12 7.97e-7 5.61  0.000316 1.92 
##  3 hVPA_FAnC11    vpa_~    0.431      4.95 8.29e-5 0.895 0.0328   1.35 
##  4 hVPA_FAnC43    vpa_~   -0.889     -4.92 9.00e-5 0.812 0.0356   0.540
##  5 hVPA_FAnC44    vpa_~    0.485      4.98 7.76e-5 0.961 0.0307   1.40 
##  6 hVPA_FAnC74    vpa_~   -0.544     -4.97 8.07e-5 0.922 0.0319   0.686
##  7 hVPA_FAnC77    vpa_~   -0.350     -4.87 9.94e-5 0.711 0.0394   0.785
##  8 hVPA_FAnC87    vpa_~    0.394      5.07 6.42e-5 1.15  0.0254   1.31 
##  9 hVPA_FAnC96    vpa_~    0.914      6.55 2.53e-6 4.43  0.00100  1.88 
## 10 hVPA_FAnC98    vpa_~    1.93       6.41 3.45e-6 4.12  0.00137  3.81 
## # ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
## #   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
vf.cell.pos.lowFA.hits
## # A tibble: 16 x 14
##    compound_short term  estimate statistic p.value   lod adj_pval    FC
##    <chr>          <fct>    <dbl>     <dbl>   <dbl> <dbl>    <dbl> <dbl>
##  1 hVPA_FApC102   vpa_~    0.803      5.23 4.55e-5 1.66   0.0194  1.74 
##  2 hVPA_FApC103   vpa_~    0.453      5.05 6.74e-5 1.27   0.0287  1.37 
##  3 hVPA_FApC113   vpa_~   -1.04      -5.16 5.35e-5 1.50   0.0228  0.486
##  4 hVPA_FApC114   vpa_~   -1.44      -5.06 6.70e-5 1.27   0.0285  0.369
##  5 hVPA_FApC117   vpa_~   -1.22      -5.36 3.41e-5 1.95   0.0145  0.428
##  6 hVPA_FApC23    vpa_~    0.398      4.98 7.91e-5 1.11   0.0337  1.32 
##  7 hVPA_FApC24    vpa_~   -0.824     -5.47 2.66e-5 2.21   0.0113  0.565
##  8 hVPA_FApC35    vpa_~    0.828      5.92 1.00e-5 3.19   0.00427 1.78 
##  9 hVPA_FApC39    vpa_~   -1.18      -5.97 8.95e-6 3.31   0.00381 0.443
## 10 hVPA_FApC59    vpa_~   -0.647     -4.82 1.15e-4 0.730  0.0491  0.639
## 11 hVPA_FApC81    vpa_~   -0.805     -5.04 6.98e-5 1.23   0.0297  0.572
## 12 hVPA_FApC82    vpa_~   -0.405     -5.64 1.81e-5 2.59   0.00773 0.755
## 13 hVPA_FApC90    vpa_~   -1.26      -5.34 3.56e-5 1.91   0.0152  0.419
## 14 hVPA_FApC96    vpa_~   -1.03      -5.74 1.48e-5 2.80   0.00631 0.491
## 15 hVPA_FApC97    vpa_~   -1.13      -5.18 5.08e-5 1.55   0.0216  0.456
## 16 hVPA_FApC98    vpa_~   -1.30      -4.90 9.60e-5 0.913  0.0409  0.406
## # ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
## #   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
vf.cell.pos.highFA.hits
## # A tibble: 4 x 14
##   compound_short term  estimate statistic p.value   lod adj_pval    FC
##   <chr>          <fct>    <dbl>     <dbl>   <dbl> <dbl>    <dbl> <dbl>
## 1 hVPA_FApC125   vpa_~   -0.504     -5.88 1.08e-5  2.82  0.00462 0.705
## 2 hVPA_FApC28    vpa_~   -0.580     -5.41 3.03e-5  1.78  0.0129  0.669
## 3 hVPA_FApC40    vpa_~   -0.423     -5.95 9.34e-6  2.97  0.00398 0.746
## 4 hVPA_FApC92    vpa_~    0.481      5.41 3.02e-5  1.78  0.0129  1.40 
## # ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
## #   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>
vf.cell.neg.both.hits %>% 
  semi_join(vf.cell.pos.both.hits, by = "compound_full") %>% 
  nrow() / 2
## [1] 3
vf.cell.neg.both.hits %>% 
  semi_join(vf.cell.pos.highFA.hits, by = "compound_full")
## # A tibble: 0 x 14
## # ... with 14 variables: compound_short <chr>, term <fct>, estimate <dbl>,
## #   statistic <dbl>, p.value <dbl>, lod <dbl>, adj_pval <dbl>, FC <dbl>,
## #   change_in_vpa <chr>, compound_full <chr>, formula <chr>, mass <dbl>,
## #   rt <dbl>, cas_id <chr>
vf.cell.neg.both.hits %>% 
  semi_join(vf.cell.pos.lowFA.hits, by = "compound_full") %>% 
  nrow() / 2
## [1] 0
vf.cell.neg.both.hits %>% 
  anti_join(vf.cell.pos.highFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.pos.lowFA.hits, by = "compound_full") %>%
  anti_join(vf.cell.pos.both.hits, by = "compound_full") %>% 
  nrow() / 2
## [1] 1
vf.cell.neg.lowFA.hits %>% 
  semi_join(vf.cell.pos.lowFA.hits, by = "compound_full") %>% 
  nrow()
## [1] 0
vf.cell.neg.lowFA.hits %>% 
  semi_join(vf.cell.pos.highFA.hits, by = "compound_full") %>% 
  nrow()
## [1] 0
vf.cell.neg.lowFA.hits %>% 
  anti_join(vf.cell.pos.both.hits, by = "compound_full") %>% 
  anti_join(vf.cell.pos.lowFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.pos.highFA.hits, by = "compound_full") %>% 
  nrow()
## [1] 9
vf.cell.pos.both.hits %>% 
  semi_join(vf.cell.neg.highFA.hits, by = "compound_full")
## # A tibble: 0 x 14
## # ... with 14 variables: compound_short <chr>, term <fct>, estimate <dbl>,
## #   statistic <dbl>, p.value <dbl>, lod <dbl>, adj_pval <dbl>, FC <dbl>,
## #   change_in_vpa <chr>, compound_full <chr>, formula <chr>, mass <dbl>,
## #   rt <dbl>, cas_id <chr>
vf.cell.pos.highFA.hits %>% 
  semi_join(vf.cell.neg.highFA.hits, by = "compound_full")
## # A tibble: 0 x 14
## # ... with 14 variables: compound_short <chr>, term <fct>, estimate <dbl>,
## #   statistic <dbl>, p.value <dbl>, lod <dbl>, adj_pval <dbl>, FC <dbl>,
## #   change_in_vpa <chr>, compound_full <chr>, formula <chr>, mass <dbl>,
## #   rt <dbl>, cas_id <chr>
vf.cell.neg.highFA.hits %>% 
  anti_join(vf.cell.pos.both.hits, by = "compound_full") %>% 
  anti_join(vf.cell.pos.highFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.pos.lowFA.hits, by = "compound_full") %>% 
  nrow()
## [1] 10
vf.cell.pos.highFA.hits %>% 
  anti_join(vf.cell.neg.both.hits, by = "compound_full") %>% 
  anti_join(vf.cell.neg.highFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.neg.lowFA.hits, by = "compound_full") %>% 
  nrow()
## [1] 4
vf.cell.pos.lowFA.hits %>% 
  semi_join(vf.cell.neg.highFA.hits, by = "compound_full")
## # A tibble: 0 x 14
## # ... with 14 variables: compound_short <chr>, term <fct>, estimate <dbl>,
## #   statistic <dbl>, p.value <dbl>, lod <dbl>, adj_pval <dbl>, FC <dbl>,
## #   change_in_vpa <chr>, compound_full <chr>, formula <chr>, mass <dbl>,
## #   rt <dbl>, cas_id <chr>
vf.cell.pos.both.hits %>% 
  anti_join(vf.cell.neg.highFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.neg.lowFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.neg.both.hits, by = "compound_full") %>% 
  nrow() / 2
## [1] 4
vf.cell.pos.lowFA.hits %>% 
  anti_join(vf.cell.neg.both.hits, by = "compound_full") %>% 
  anti_join(vf.cell.neg.lowFA.hits, by = "compound_full") %>% 
  anti_join(vf.cell.neg.highFA.hits, by = "compound_full") %>% 
  nrow()
## [1] 16

4.7 Write to csv

(not evaluated)

#write_csv(vf.cell.neg.modSV.resid, path = "./results/modsv_resid/vpa_fa_p5_cell_neg_modSV_resid.csv")
#write_csv(vf.cell.pos.modSV.resid, path = "./results/modsv_resid/vpa_fa_p5_cell_pos_modSV_resid.csv")
#write_csv(vf.med.neg.smpl.data, path = "./results/modsv_resid/vpa_fa_p5_media_neg_smpl_data.csv")
# write_csv(vf.med.pos.modSV.resid, path = "./results/modsv_resid/vpa_fa_p5_med_pos_modSV_resid.csv")
# write_csv(vf.med.neg.modSV.resid, path = "./results/modsv_resid/vpa_fa_p5_med_neg_modSV_resid.csv")

5 Session Info

sessionInfo()
R version 3.5.3 (2019-03-11)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggthemes_4.1.0      ggridges_0.5.1      biobroom_1.14.0    
 [4] broom_0.5.1         limma_3.38.3        sva_3.30.0         
 [7] BiocParallel_1.16.2 genefilter_1.64.0   mgcv_1.8-28        
[10] nlme_3.1-137        heatmaply_0.15.2    viridis_0.5.1      
[13] viridisLite_0.3.0   plotly_4.8.0        GGally_1.4.0       
[16] cowplot_0.9.4       forcats_0.4.0       stringr_1.4.0      
[19] dplyr_0.8.0.1       purrr_0.3.2         readr_1.3.1        
[22] tidyr_0.8.3         tibble_2.1.1        ggplot2_3.1.0      
[25] tidyverse_1.2.1    

loaded via a namespace (and not attached):
  [1] colorspace_1.4-1     class_7.3-15         modeltools_0.2-22   
  [4] mclust_5.4.3         rstudioapi_0.10      flexmix_2.3-15      
  [7] bit64_0.9-7          fansi_0.4.0          AnnotationDbi_1.44.0
 [10] mvtnorm_1.0-10       lubridate_1.7.4      xml2_1.2.0          
 [13] codetools_0.2-16     splines_3.5.3        robustbase_0.93-4   
 [16] knitr_1.22           jsonlite_1.6         annotate_1.60.0     
 [19] cluster_2.0.7-1      kernlab_0.9-27       shiny_1.2.0         
 [22] compiler_3.5.3       httr_1.4.0           backports_1.1.3     
 [25] assertthat_0.2.1     Matrix_1.2-16        lazyeval_0.2.2      
 [28] cli_1.1.0            later_0.8.0          htmltools_0.3.6     
 [31] tools_3.5.3          gtable_0.2.0         glue_1.3.1          
 [34] reshape2_1.4.3       Rcpp_1.0.1           Biobase_2.42.0      
 [37] cellranger_1.1.0     trimcluster_0.1-2.1  gdata_2.18.0        
 [40] crosstalk_1.0.0      iterators_1.0.10     fpc_2.1-11.1        
 [43] xfun_0.5             rvest_0.3.2          mime_0.6            
 [46] gtools_3.8.1         XML_3.98-1.19        dendextend_1.10.0   
 [49] DEoptimR_1.0-8       MASS_7.3-51.1        scales_1.0.0        
 [52] TSP_1.1-6            promises_1.0.1       hms_0.4.2           
 [55] parallel_3.5.3       RColorBrewer_1.1-2   yaml_2.2.0          
 [58] memoise_1.1.0        gridExtra_2.3        reshape_0.8.8       
 [61] stringi_1.4.3        RSQLite_2.1.1        gclus_1.3.2         
 [64] S4Vectors_0.20.1     foreach_1.4.4        seriation_1.2-3     
 [67] caTools_1.17.1.2     BiocGenerics_0.28.0  matrixStats_0.54.0  
 [70] rlang_0.3.2          pkgconfig_2.0.2      prabclus_2.2-7      
 [73] bitops_1.0-6         evaluate_0.13        lattice_0.20-38     
 [76] labeling_0.3         htmlwidgets_1.3      bit_1.1-14          
 [79] tidyselect_0.2.5     plyr_1.8.4           magrittr_1.5        
 [82] R6_2.4.0             IRanges_2.16.0       gplots_3.0.1.1      
 [85] generics_0.0.2       DBI_1.0.0            pillar_1.3.1        
 [88] haven_2.1.0          whisker_0.3-2        withr_2.1.2         
 [91] survival_2.43-3      RCurl_1.95-4.12      nnet_7.3-12         
 [94] modelr_0.1.4         crayon_1.3.4         utf8_1.1.4          
 [97] KernSmooth_2.23-15   rmarkdown_1.12       grid_3.5.3          
[100] readxl_1.3.1         data.table_1.12.0    blob_1.1.1          
[103] digest_0.6.18        diptest_0.75-7       webshot_0.5.1       
[106] xtable_1.8-3         httpuv_1.5.0         stats4_3.5.3        
[109] munsell_0.5.0        registry_0.5-1